import pandas as pd
import numpy as np
# Visualization Libraries
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from yellowbrick.regressor import ResidualsPlot
import plotly.graph_objects as go
from plotly.subplots import make_subplots
%matplotlib inline
# For Saving Files
import os
# Model Training And Testing libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, RepeatedKFold
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
# Best Features Selection For Each Category Libraries
from sklearn.feature_selection import SelectKBest, f_regression
# Saving Model
import pickle
# Profiling Libraries
from ydata_profiling import ProfileReport
# Load the IMDb India movies dataset (latin1 handles non-UTF-8 characters in names)
movies = pd.read_csv(
    r"D:\Projects\Python\CodeSoft Internship\Movie Project\IMDb Movies India.csv",
    encoding='latin1',
)
# Build an exploratory profiling report over the raw data
profile = ProfileReport(movies, title="Movies Profiling Report", explorative=True)
profile
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
# Save the report to the specified path.
# BUG FIX: the report is HTML, but the old target path reused the dataset's .csv
# name — ydata-profiling warned "Extension .csv not supported" (see the captured
# warning below) and the write risked overwriting the source data file.
profile.to_file(r"D:\Projects\CodeSoft Internship\Movie Project\IMDb Movies India Report.html")
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
D:\Anaconda\Lib\site-packages\ydata_profiling\profile_report.py:360: UserWarning: Extension .csv not supported. For now we assume .html was intended. To remove this warning, please use .html or .json. warnings.warn(
Export report to file: 0%| | 0/1 [00:00<?, ?it/s]
# Preview the first five rows of the raw dataset
movies.head()
| Name | Year | Duration | Genre | Rating | Votes | Director | Actor 1 | Actor 2 | Actor 3 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NaN | NaN | Drama | NaN | NaN | J.S. Randhawa | Manmauji | Birbal | Rajendra Bhatia | |
| 1 | #Gadhvi (He thought he was Gandhi) | (2019) | 109 min | Drama | 7.0 | 8 | Gaurav Bakshi | Rasika Dugal | Vivek Ghamande | Arvind Jangid |
| 2 | #Homecoming | (2021) | 90 min | Drama, Musical | NaN | NaN | Soumyajit Majumdar | Sayani Gupta | Plabita Borthakur | Roy Angana |
| 3 | #Yaaram | (2019) | 110 min | Comedy, Romance | 4.4 | 35 | Ovais Khan | Prateik | Ishita Raj | Siddhant Kapoor |
| 4 | ...And Once Again | (2010) | 105 min | Drama | NaN | NaN | Amol Palekar | Rajat Kapoor | Rituparna Sengupta | Antara Mali |
# Column dtypes and non-null counts (most columns load as object and need cleaning)
movies.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 15509 entries, 0 to 15508 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Name 15509 non-null object 1 Year 14981 non-null object 2 Duration 7240 non-null object 3 Genre 13632 non-null object 4 Rating 7919 non-null float64 5 Votes 7920 non-null object 6 Director 14984 non-null object 7 Actor 1 13892 non-null object 8 Actor 2 13125 non-null object 9 Actor 3 12365 non-null object dtypes: float64(1), object(9) memory usage: 1.2+ MB
# (rows, columns) of the raw dataset
movies.shape
(15509, 10)
# Column labels of the dataset
movies.columns
Index(['Name', 'Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director',
'Actor 1', 'Actor 2', 'Actor 3'],
dtype='object')
def percent_counts(df, feature):
    """Return a two-column table of absolute and percentage value counts
    (NaN included) for *feature* in *df*."""
    counts = df[feature].value_counts(dropna=False)
    shares = (df[feature].value_counts(dropna=False, normalize=True) * 100).round(2)
    return pd.concat([counts, shares], keys=['Total', 'Percentage'], axis=1)
# Genre frequency table — note ~12% missing and many comma-joined multi-genre strings
percent_counts(movies, 'Genre')
| Total | Percentage | |
|---|---|---|
| Drama | 2780 | 17.93 |
| NaN | 1877 | 12.10 |
| Action | 1289 | 8.31 |
| Thriller | 779 | 5.02 |
| Romance | 708 | 4.57 |
| ... | ... | ... |
| Action, Musical, War | 1 | 0.01 |
| Horror, Crime, Thriller | 1 | 0.01 |
| Animation, Comedy | 1 | 0.01 |
| Romance, Action, Crime | 1 | 0.01 |
| Adventure, Fantasy, Sci-Fi | 1 | 0.01 |
486 rows × 2 columns
# Rating frequency table — roughly half the ratings are missing
percent_counts(movies, 'Rating')
| Total | Percentage | |
|---|---|---|
| NaN | 7590 | 48.94 |
| 6.2 | 269 | 1.73 |
| 6.8 | 264 | 1.70 |
| 6.5 | 254 | 1.64 |
| 6.6 | 239 | 1.54 |
| ... | ... | ... |
| 9.7 | 1 | 0.01 |
| 1.4 | 1 | 0.01 |
| 10.0 | 1 | 0.01 |
| 9.6 | 1 | 0.01 |
| 1.1 | 1 | 0.01 |
85 rows × 2 columns
# Destination workbook for the per-column value-count report
output_path = r"C:\Users\acer\Downloads\CodeSoft Internship\Movie Project\movies_percent_counts.xlsx"
# Write one sheet of count/percentage statistics per column, using the XlsxWriter engine
with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
    for column_name in movies.columns:
        percent_counts(movies, column_name).to_excel(writer, sheet_name=column_name)
print(f"Report saved to {output_path}")
# Per-column count of missing entries (isna is the modern alias of isnull)
missing_values = movies.isna().sum()
print("Missing values: \n")
missing_values
Missing values:
Name 0 Year 528 Duration 8269 Genre 1877 Rating 7590 Votes 7589 Director 525 Actor 1 1617 Actor 2 2384 Actor 3 3144 dtype: int64
# Locating rows where every column from 'Year' (index 1) through 'Actor 3'
# (index 9) is missing — i.e. only the title is known.
# BUG FIX: the original used iloc[:, 1:9], whose exclusive end stopped at
# 'Actor 2' and silently ignored 'Actor 3'; 1:10 matches the stated intent.
nulls = movies[movies.iloc[:, 1:10].isnull().all(axis=1)]
nulls.head(7)
| Name | Year | Duration | Genre | Rating | Votes | Director | Actor 1 | Actor 2 | Actor 3 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 1836 | Bang Bang Reloaded | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1920 | Battle of bittora | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2653 | Campus | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3403 | Dancing Dad | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3807 | Dial 100 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4271 | Durga Rani Singh | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 8669 | Main Hoon Kaun | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
# Handling the null values: keep only rows complete in every key field
movies.dropna(subset=['Name', 'Year', 'Genre', 'Duration', 'Rating', 'Votes', 'Director', 'Actor 1', 'Actor 2', 'Actor 3'], inplace=True)
# Keep only the alphabetic part of 'Name' (drops leading '#', digits, etc.)
movies['Name'] = movies['Name'].str.extract(r'([A-Za-z\s\'\-]+)')
# Strip the parentheses from 'Year' (e.g. "(2019)") and convert to int
movies['Year'] = movies['Year'].apply(lambda x: int(x.strip('()')))
# Drop the ' min' suffix from 'Duration' and convert to numeric
movies['Duration'] = movies['Duration'].str.replace(' min', '').astype(float)
# Remove thousands separators from 'Votes' and convert to numeric
movies['Votes'] = movies['Votes'].apply(lambda x: float(x.replace(',', '')))
# Split multi-genre strings ("Drama, Musical") and give each genre its own row
movies['Genre'] = movies['Genre'].str.split(', ')
movies = movies.explode('Genre')
# BUG FIX: `movies['Genre'].fillna(..., inplace=True)` is chained assignment —
# deprecated in modern pandas and liable to act on a temporary copy; assign the
# result instead. (After the dropna above this is only a safety net.)
movies['Genre'] = movies['Genre'].fillna(movies['Genre'].mode()[0])
# Check for duplicate rows in the DataFrame
data_dup = movies.duplicated().any()
print(f"Are there duplicate rows in the dataset? {data_dup}")
# Collect, per column, the values that occur more than once (with their counts)
duplicate_columns = {}
for column in movies.columns:
    # PERF FIX: the original called value_counts() up to three times per column
    # (once via duplicated() and twice in the filter); compute it once.
    # Note: value_counts drops NaN, so a column whose only repeats are NaN is
    # skipped — equivalent here since nulls were removed during cleaning.
    counts = movies[column].value_counts()
    repeated = counts[counts > 1]
    if not repeated.empty:
        duplicate_columns[column] = repeated
# Print the columns that contain duplicated values
for column, counts in duplicate_columns.items():
    print(f"\nColumn: {column}")
Are there duplicate rows in the dataset? True Column: Name Column: Year Column: Duration Column: Genre Column: Rating Column: Votes Column: Director Column: Actor 1 Column: Actor 2 Column: Actor 3
# Convert the per-column duplicate-count dictionary to a DataFrame
# (columns = original column names, rows = duplicated values)
df_duplicate_info = pd.DataFrame.from_dict(duplicate_columns, orient='index').transpose()
# Save the DataFrame to an Excel file for manual review
output_path = r"C:\Users\acer\Downloads\CodeSoft Internship\Duplicate_columns.xlsx"
df_duplicate_info.to_excel(output_path, index=False)
print(f"Duplicate columns information saved to {output_path}")
Duplicate columns information saved to C:\Users\acer\Downloads\CodeSoft Internship\Duplicate_columns.xlsx
# Remove every row belonging to a duplicate group (keep=False drops all copies).
# FIX: 'Rating' was listed twice in the subset; also dropped the redundant
# inplace=False (the default). NOTE(review): 'Duration' is absent from the
# subset — confirm that is intentional.
movies = movies.drop_duplicates(subset=['Name', 'Year', 'Genre', 'Rating', 'Votes', 'Director', 'Actor 1', 'Actor 2', 'Actor 3'], keep=False)
# Count distinct (non-null) values per column, matching value_counts() semantics.
# FIX: the original accumulated into a variable named `dict`, shadowing the
# builtin; use a comprehension with a descriptive name instead.
unique_value_counts = {col: movies[col].value_counts().shape[0] for col in movies.columns}
unique = pd.DataFrame(unique_value_counts, index=["unique count"]).transpose()
unique
| unique count | |
|---|---|
| Name | 5282 |
| Year | 91 |
| Duration | 174 |
| Genre | 22 |
| Rating | 83 |
| Votes | 2027 |
| Director | 2431 |
| Actor 1 | 1960 |
| Actor 2 | 2321 |
| Actor 3 | 2556 |
# Dump all object-typed (categorical) columns to Excel for manual typo inspection
movies_obj = movies.select_dtypes(include="object")
excel_path = r"C:\Users\acer\Downloads\CodeSoft Internship\Movie Project\movie_column_unique_values.xlsx"
with pd.ExcelWriter(excel_path) as writer:
    movies_obj.to_excel(writer, sheet_name="unique_values")
print("Report saved to Excel file")
Report saved to Excel file
# Partition column names into numeric (continuous) and non-numeric (categorical)
continuous_values = []
categorical_values = []
for column in movies.columns:
    # dtype compares equal to its string name, so membership in the tuple works
    target = continuous_values if movies[column].dtype in ('int64', 'float64') else categorical_values
    target.append(column)
print(f"Categorical columns: {categorical_values}")
print(f"Continuous columns: {continuous_values}")
Categorical columns: ['Name', 'Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3'] Continuous columns: ['Year', 'Duration', 'Rating', 'Votes']
movies.head()
| Name | Year | Duration | Genre | Rating | Votes | Director | Actor 1 | Actor 2 | Actor 3 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Gadhvi | 2019 | 109 | Drama | 7.0 | 8 | Gaurav Bakshi | Rasika Dugal | Vivek Ghamande | Arvind Jangid |
| 3 | Yaaram | 2019 | 110 | Comedy | 4.4 | 35 | Ovais Khan | Prateik | Ishita Raj | Siddhant Kapoor |
| 3 | Yaaram | 2019 | 110 | Romance | 4.4 | 35 | Ovais Khan | Prateik | Ishita Raj | Siddhant Kapoor |
| 5 | Aur Pyaar Ho Gaya | 1997 | 147 | Comedy | 4.7 | 827 | Rahul Rawail | Bobby Deol | Aishwarya Rai Bachchan | Shammi Kapoor |
| 5 | Aur Pyaar Ho Gaya | 1997 | 147 | Drama | 4.7 | 827 | Rahul Rawail | Bobby Deol | Aishwarya Rai Bachchan | Shammi Kapoor |
movies.isnull().sum()
Name 0 Year 0 Duration 0 Genre 0 Rating 0 Votes 0 Director 0 Actor 1 0 Actor 2 0 Actor 3 0 dtype: int64
def outlier_detect(df, col):
    """Return rows of *df* whose *col* value lies outside the 1.5*IQR fences.

    Relies on the module-level Q1, Q3 and IQR Series computed further below.
    """
    lower_fence = Q1[col] - 1.5 * IQR[col]
    upper_fence = Q3[col] + 1.5 * IQR[col]
    outside = (df[col] < lower_fence) | (df[col] > upper_fence)
    return df[outside]
# ---------------------------------------------------------
def outlier_detect_categorical(df, col):
    """Detect outliers in *col* of *df*.

    Object columns: categories occurring fewer than 5 times are returned
    (as a value->count Series). Numeric columns: rows outside the 1.5*IQR
    fences (uses module-level Q1/Q3/IQR) are returned.
    """
    if df[col].dtype == 'object':
        # Rare categorical values count as outliers
        frequencies = df[col].value_counts()
        return frequencies[frequencies < 5]
    # Numeric columns fall back to the IQR fence method
    lower_fence = Q1[col] - 1.5 * IQR[col]
    upper_fence = Q3[col] + 1.5 * IQR[col]
    return df[(df[col] < lower_fence) | (df[col] > upper_fence)]
# ---------------------------------------------------------
def outlier_detect_normal(df, col):
    """Return rows of *df* whose *col* z-score magnitude exceeds 3."""
    mean = df[col].mean()
    std = df[col].std()
    z_scores = (df[col] - mean) / std
    return df[z_scores.abs() > 3]
# ---------------------------------------------------------
def lower_outlier(df, col):
    """Return rows of *df* below the lower 1.5*IQR fence for *col*.

    Uses module-level Q1/IQR. FIX: dropped the unused Q3 lookup.
    """
    lower_fence = Q1[col] - 1.5 * IQR[col]
    return df[df[col] < lower_fence]
# ---------------------------------------------------------
def upper_outlier(df, col):
    """Return rows of *df* above the upper 1.5*IQR fence for *col*.

    Uses module-level Q3/IQR. FIX: dropped the unused Q1 lookup.
    """
    upper_fence = Q3[col] + 1.5 * IQR[col]
    return df[df[col] > upper_fence]
# ---------------------------------------------------------
def replace_upper(df, col):
    """Cap values of *col* above the upper 1.5*IQR fence at the fence, in place.

    FIX: the original routed outliers through a 9999999 sentinel and then
    replaced the sentinel with the bound — a value collision with real data
    would silently corrupt it. A single where() achieves the same clamp:
    keep x where x < upper, otherwise substitute the upper bound.
    """
    upper = Q3[col] + 1.5 * IQR[col]
    df[col] = df[col].where(df[col] < upper, upper)
    print('outlier replace with upper bound - {}' .format(col))
# ---------------------------------------------------------
def replace_lower(df, col):
    """Floor values of *col* below the lower 1.5*IQR fence at the fence, in place.

    FIX: removes the fragile 1111111 sentinel round-trip (see replace_upper)
    in favour of a direct where(): keep x where x > lower, else the bound.
    """
    lower = Q1[col] - 1.5 * IQR[col]
    df[col] = df[col].where(df[col] > lower, lower)
    print('outlier replace with lower bound - {}' .format(col))
# ---------------------------------------------------------
# Quartiles and IQR over the numeric columns — module-level on purpose:
# the outlier helper functions above read Q1/Q3/IQR as globals.
Q1 = movies.quantile(0.25, numeric_only=True)
Q3 = movies.quantile(0.75, numeric_only=True)
IQR = Q3 - Q1
# Compare outlier counts from the IQR and z-score methods for each numeric column.
# FIX: iterate the column names directly instead of `for i in range(len(...))`.
for col in continuous_values:
    print("IQR => {}: {}".format(col, outlier_detect(movies, col).shape[0]))
    print("Z_Score => {}: {}".format(col, outlier_detect_normal(movies, col).shape[0]))
    print("********************************")
IQR => Year: 26 Z_Score => Year: 40 ******************************** IQR => Duration: 174 Z_Score => Duration: 101 ******************************** IQR => Rating: 60 Z_Score => Rating: 19 ******************************** IQR => Votes: 1882 Z_Score => Votes: 136 ********************************
# Columns that actually contain IQR outliers.
# FIX: replaced the index-based loops with direct iteration and a comprehension.
outlier = [col for col in continuous_values
           if outlier_detect(movies[continuous_values], col).shape[0] != 0]
outlier
# Clamp outliers to the upper fence, then to the lower fence
for col in outlier:
    replace_upper(movies, col)
print("\n********************************\n")
for col in outlier:
    replace_lower(movies, col)
# Recompute the fences on the clamped data and re-run both detection methods
Q1 = movies.quantile(0.25, numeric_only=True)
Q3 = movies.quantile(0.75, numeric_only=True)
IQR = Q3 - Q1
for col in continuous_values:
    print("IQR => {}: {}".format(col, outlier_detect(movies, col).shape[0]))
    print("Z_Score => {}: {}".format(col, outlier_detect_normal(movies, col).shape[0]))
    print("********************************")
outlier replace with upper bound - Year outlier replace with upper bound - Duration outlier replace with upper bound - Rating outlier replace with upper bound - Votes ******************************** outlier replace with lower bound - Year outlier replace with lower bound - Duration outlier replace with lower bound - Rating outlier replace with lower bound - Votes IQR => Year: 0 Z_Score => Year: 40 ******************************** IQR => Duration: 0 Z_Score => Duration: 0 ******************************** IQR => Rating: 0 Z_Score => Rating: 0 ******************************** IQR => Votes: 0 Z_Score => Votes: 0 ********************************
Most frequent values — Name: 'Gadhvi'; Genre: Drama (1,617 entries); Director: K (115 entries); Actor 1: Jeetendra (82 entries); Actor 2: Rekha (44 entries); Actor 3: Pran (46 entries). Richness in Diversity:
Year Range:
Vote and Rating Analysis:
Duration Analysis:
✨ Let the cinematic exploration begin! 🌟
Outliers Identified Using:
Outliers Replaced with Upper and Lower Bounds for:
Null Value Handling:
Name, Year, Genre, Duration, Rating, Votes, Director, Actor 1, Actor 2, Actor 3.Text Extraction:
Name column using regex to remove any non-alphabetic characters.Year Formatting:
Year column and converted the values to integers for consistency.Duration Conversion:
Duration column and converted it to a numeric format, enabling easier analysis of movie lengths.Votes Conversion:
Votes column and converted it to a numeric format to facilitate statistical operations.Genre Normalization:
Name, Year, Genre, Rating, Votes, Director, Actor 1, Actor 2, Actor 3.✨ The dataset is now prepared for detailed analysis, providing a reliable foundation for exploring trends and patterns in the movie industry! 🌟
# Summary statistics for the numeric columns
movies.describe()
| Year | Duration | Rating | Votes | Genre_mean_rating | Director_encoded | Actor1_encoded | Actor2_encoded | Actor3_encoded | |
|---|---|---|---|---|---|---|---|---|---|
| count | 11963.000000 | 11963.000000 | 11963.000000 | 11963.000000 | 11963.000000 | 11963.000000 | 11963.000000 | 11963.000000 | 11963.000000 |
| mean | 1996.077656 | 135.868762 | 5.878851 | 3333.946418 | 5.878851 | 5.878851 | 5.878851 | 5.878851 | 5.878851 |
| std | 19.317122 | 25.027876 | 1.374141 | 15314.164524 | 0.309586 | 1.098326 | 1.001887 | 1.033647 | 1.046030 |
| min | 1931.000000 | 21.000000 | 1.100000 | 5.000000 | 4.811000 | 1.600000 | 1.400000 | 1.600000 | 1.700000 |
| 25% | 1982.000000 | 120.000000 | 5.000000 | 41.000000 | 5.749888 | 5.246154 | 5.292857 | 5.267857 | 5.279167 |
| 50% | 2001.000000 | 137.000000 | 6.100000 | 212.000000 | 5.818421 | 6.000000 | 6.000000 | 5.945455 | 5.960000 |
| 75% | 2012.000000 | 152.000000 | 6.900000 | 1457.000000 | 6.055878 | 6.700000 | 6.575281 | 6.561818 | 6.575410 |
| max | 2021.000000 | 321.000000 | 10.000000 | 591417.000000 | 8.000000 | 10.000000 | 10.000000 | 10.000000 | 10.000000 |
# Summary (count/unique/top/freq) for the object-typed columns
movies.describe(include='object')
| Name | Genre | Director | Actor 1 | Actor 2 | Actor 3 | |
|---|---|---|---|---|---|---|
| count | 4990 | 4990 | 4990 | 4990 | 4990 | 4990 |
| unique | 4990 | 21 | 2149 | 1853 | 2206 | 2412 |
| top | Gadhvi | Drama | K | Jeetendra | Rekha | Pran |
| freq | 1 | 1617 | 115 | 82 | 44 | 46 |
# Rank all movies by vote count, highest first
sorted_movies = movies.sort_values(by='Votes', ascending=False)
# The three most-voted and three least-voted titles
top_3_highest_votes = sorted_movies.head(3)
bottom_3_lowest_votes = sorted_movies.tail(3)
# Keep only the columns of interest for display
display_cols = ['Name', 'Votes', 'Year', 'Rating']
top_3_df = pd.DataFrame(top_3_highest_votes[display_cols])
bottom_3_df = pd.DataFrame(bottom_3_lowest_votes[display_cols])
print("\nTop 3 Movies By Votes")
top_3_df
Top 3 Movies By Votes
| Name | Votes | Year | Rating | |
|---|---|---|---|---|
| 8219 | Life of Pi | 591417 | 2012 | 7.9 |
| 75 | Idiots | 357889 | 2009 | 8.4 |
| 8233 | Lion | 220526 | 2016 | 8.0 |
# Display the three least-voted movies computed above
print("\nBottom 3 Movies By Votes")
bottom_3_df
Bottom 3 Movies By Votes
| Name | Votes | Year | Rating | |
|---|---|---|---|---|
| 11628 | Rakshaa Bandhan | 5 | 1977 | 7.6 |
| 3361 | Daku Kali Bhawani | 5 | 2000 | 3.8 |
| 4384 | Ek Daku Saher Mein | 5 | 1985 | 3.8 |
# Sort the DataFrame by 'Rating' column in descending order
# (NOTE: the "votes" variable names are reused from the previous cell;
# here they hold the highest/lowest RATED movies.)
sorted_movies = movies.sort_values(by='Rating', ascending=False)
# Get the top 3 movies with the highest rating
top_3_highest_votes = sorted_movies.head(3)
# Get the bottom 3 movies with the lowest rating
bottom_3_lowest_votes = sorted_movies.tail(3)
# Select display columns for the top 3
top_3_df = pd.DataFrame(top_3_highest_votes[['Name', 'Rating', 'Year', 'Votes']])
# Select display columns for the bottom 3
bottom_3_df = pd.DataFrame(bottom_3_lowest_votes[['Name', 'Rating', 'Year', 'Votes']])
print("\nTop 3 Movies By Rating")
top_3_df
Top 3 Movies By Rating
| Name | Rating | Year | Votes | |
|---|---|---|---|---|
| 8339 | Love Qubool Hai | 10.0 | 2020 | 5 |
| 5410 | Half Songs | 9.7 | 2021 | 7 |
| 14222 | The Reluctant Crime | 9.4 | 2020 | 16 |
# Display the three lowest-rated movies computed above
print("\nBottom 3 Movies By Rating")
bottom_3_df
Bottom 3 Movies By Rating
| Name | Rating | Year | Votes | |
|---|---|---|---|---|
| 15040 | Welcome to New York | 1.6 | 2018 | 774 |
| 3618 | Desh Drohi | 1.4 | 2008 | 3899 |
| 12171 | Sadak | 1.1 | 2020 | 67785 |
# Group movies by release year
movies_by_year = movies.groupby('Year')
# Return the single highest-rated row of a year group
def get_top_rated_movie(group):
    return group.sort_values(by='Rating', ascending=False).head(1)
# Apply per-group; the result is one row per year (with a MultiIndex from apply)
top_rated_movies_per_year = movies_by_year.apply(get_top_rated_movie)
# Flatten the (Year, original-index) MultiIndex created by apply
top_rated_movies_per_year = top_rated_movies_per_year.reset_index(drop=True)
top_rated_movies_per_year
| Name | Year | Duration | Genre | Rating | Votes | Director | Actor 1 | Actor 2 | Actor 3 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | The Light of the World | 1931 | 124 | Drama | 6.2 | 112 | Ardeshir Irani | Master Vithal | Zubeida | Jillo |
| 1 | Indrasabha | 1932 | 211 | Musical | 6.0 | 12 | J | Nissar | Jehanara Kajjan | Abdul Rehman Kabuli |
| 2 | Puran Bhagat | 1933 | 159 | Unknown | 6.5 | 10 | Debaki Bose | Choudhury Mohammed Rafiq | M | Anwari |
| 3 | Chandidas | 1934 | 128 | Unknown | 6.6 | 5 | Nitin Bose | K | Umasashi | Pahadi Sanyal |
| 4 | Inquilab | 1935 | 144 | Drama | 7.4 | 38 | Debaki Bose | Durga Khote | Prithviraj Kapoor | Syed Mohammed |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 86 | Rediscovering India | 2017 | 124 | Documentary | 9.0 | 62 | Meenal Dixit | Benny John | Benny John | Meenal Dixit |
| 87 | Ashok Vatika | 2018 | 97 | Drama | 9.3 | 7 | Rahul Mallick | Kunj Anand | Sanjay Bishnoi | Paras Zutshi |
| 88 | Gho Gho Rani | 2019 | 105 | History | 9.4 | 47 | Munni Pankaj | Nishi Neha Mishra | Pankaj Kamal | Akash Kumar |
| 89 | Love Qubool Hai | 2020 | 94 | Drama | 10.0 | 5 | Saif Ali Sayeed | Ahaan Jha | Mahesh Narayan | Rajasree Rajakumari |
| 90 | Half Songs | 2021 | 79 | Music | 9.7 | 7 | Sriram Raja | Raj Banerjee | Emon Chatterjee | Purshottam Mulani |
91 rows × 10 columns
# Destination path for the per-year winners (replace with your actual path)
file_path = "C:\\Users\\acer\\Downloads\\CodeSoft Internship\\Movie Project\\top_rated_movies_per_year.xlsx"
# Write the DataFrame to the Excel file (without the index column)
top_rated_movies_per_year.to_excel(file_path, index=False)
print("Top rated movies per year saved to", file_path)
Top rated movies per year saved to C:\Users\acer\Downloads\CodeSoft Internship\Movie Project\top_rated_movies_per_year.xlsx
# Number of movies per director in the cleaned data
director_counts = movies['Director'].value_counts()
# The three most prolific directors by movie count
top_3_directors = director_counts.nlargest(3)
print("Top 3 directors with the most movies :")
top_3_directors
Top 3 directors with the most movies :
K 115 S 78 R 40 Name: Director, dtype: int64
# Find the director with the highest rated movie
top_rated_movie = movies.sort_values(by='Rating', ascending=False).head(1)
top_rated_director = top_rated_movie['Director'].values[0] # Extract director name
# Find the director with the lowest rated movie
bottom_rated_movie = movies.sort_values(by='Rating', ascending=True).head(1)
bottom_rated_director = bottom_rated_movie['Director'].values[0] # Extract director name
# Find the director with the most voted movie
most_voted_movie = movies.sort_values(by='Votes', ascending=False).head(1)
most_voted_director = most_voted_movie['Director'].values[0] # Extract director name
# Find the director with the least voted movie
# (NOTE: ties on the minimum vote count resolve arbitrarily via the sort)
least_voted_movie = movies.sort_values(by='Votes', ascending=True).head(1)
least_voted_director = least_voted_movie['Director'].values[0] # Extract director name
# Print each extreme along with the movie's details
print("\nDirector with the highest rated movie:", top_rated_director, "\n")
print(top_rated_movie[['Name', 'Year', 'Rating', 'Genre']].to_string(index=False)) # Display movie details
print('\n', '='*100, '\n')
print("\nDirector with the lowest rated movie:", bottom_rated_director , "\n")
print(bottom_rated_movie[['Name', 'Year', 'Rating', 'Genre']].to_string(index=False)) # Display movie details
print('\n', '='*100, '\n')
print("\nDirector with the most voted movie:", most_voted_director, "\n")
print(most_voted_movie[['Name', 'Year', 'Votes', 'Rating', 'Genre']].to_string(index=False)) # Display movie details
print('\n', '='*100, '\n')
print("\nDirector with the least voted movie:", least_voted_director, "\n")
print(least_voted_movie[['Name', 'Year', 'Votes', 'Rating', 'Genre']].to_string(index=False)) # Display movie details
Director with the highest rated movie: Saif Ali Sayeed
Name Year Rating Genre
Love Qubool Hai 2020 10.0 Drama
====================================================================================================
Director with the lowest rated movie: Mahesh Bhatt
Name Year Rating Genre
Sadak 2020 1.1 Action
====================================================================================================
Director with the most voted movie: Ang Lee
Name Year Votes Rating Genre
Life of Pi 2012 591417 7.9 Adventure
====================================================================================================
Director with the least voted movie: Saif Ali Sayeed
Name Year Votes Rating Genre
Love Qubool Hai 2020 5 10.0 Drama
# Count movies in each genre (after the explode, one row per movie-genre pair)
genre_counts = movies['Genre'].value_counts()
# Get the top 3 genres with the most movies (using nlargest)
top_3_genres = genre_counts.nlargest(3)
print("Top 3 genres with the most movies:")
print(top_3_genres)
Top 3 genres with the most movies: Drama 1617 Action 1389 Comedy 921 Name: Genre, dtype: int64
# Count movies in each genre (recomputed; same as the previous cell)
genre_counts = movies['Genre'].value_counts()
# Get the top 3 genres with the most movies (using nlargest)
top_3_genres = genre_counts.nlargest(3)
# For each of the top 3 genres, report its best- and worst-rated movie
for i, genre in enumerate(top_3_genres.index):
    # Genre numbering and highlighting
    print(f"\n** Genre {i+1}: {genre.upper()} **") # Uppercase for highlighting
    # Filter movies for the current genre
    genre_movies = movies[movies['Genre'] == genre]
    # Find the top rated movie within the genre
    top_rated_movie = genre_movies.sort_values(by='Rating', ascending=False).head(1)
    # Find the bottom rated movie within the genre
    bottom_rated_movie = genre_movies.sort_values(by='Rating', ascending=True).head(1)
    # Print results for the current genre
    print("\n * Top Rated Movie:")
    print(top_rated_movie[['Name', 'Year', 'Rating']].to_string(index=False)) # Display movie details
    print('\n', '_'*100, '\n')
    print("\n * Bottom Rated Movie:")
    print(bottom_rated_movie[['Name', 'Year', 'Rating']].to_string(index=False)) # Display movie details
    print('\n', '='*100, '\n')
** Genre 1: DRAMA **
* Top Rated Movie:
Name Year Rating
Love Qubool Hai 2020 10.0
____________________________________________________________________________________________________
* Bottom Rated Movie:
Name Year Rating
Mumbai Can Dance Saalaa 2015 1.6
====================================================================================================
** Genre 2: ACTION **
* Top Rated Movie:
Name Year Rating
I'm in Love 2007 9.2
____________________________________________________________________________________________________
* Bottom Rated Movie:
Name Year Rating
Sadak 2020 1.1
====================================================================================================
** Genre 3: COMEDY **
* Top Rated Movie:
Name Year Rating
Love Sorries 2021 9.3
____________________________________________________________________________________________________
* Bottom Rated Movie:
Name Year Rating
Welcome to New York 2018 1.6
====================================================================================================
1️⃣ The dataset spans from 1931 onwards, with the shortest movie having a duration of just 45 minutes.
2️⃣ Drama emerges as the most prevalent genre, with actor Mithun leading in terms of appearances.
3️⃣ Analysis showcases both the highest-rated and the lowest-rated movies, along with their corresponding votes.
4️⃣ Directors' contributions vary significantly, with some directing a large number of movies while others direct fewer.
# Palette for the count plots below.
# FIX: 'lightpurple' is not a valid matplotlib/CSS named color and would raise
# ValueError if that slot were ever consumed; replaced with 'mediumpurple'.
colors = ['blue', 'orange', 'green', 'red', 'purple', 'brown', 'gray', 'yellow', 'cyan', 'black', 'olive' , 'lime', 'magenta', 'lightblue', 'lightcyan', 'lightgray', 'lightgreen', 'mediumpurple', 'lightyellow', 'white']
def TopFivePlot(df, column, palette=None):
    """Bar-plot the five most frequent values of *column* in *df*.

    Prints an error message and returns early if the column is absent.
    """
    if column not in df.columns:
        print(f"Error: '{column}' column not found in the DataFrame.")
        return
    # Order the bars by descending frequency, keeping only the top five
    top_five = df[column].value_counts().nlargest(5).index
    plt.figure(figsize=(12, 6))
    sns.countplot(data=df, x=column, order=top_five, edgecolor="white", palette=palette)
    plt.xticks(rotation=0)
    plt.title(f"Top Five {column}")
    plt.xlabel(column)
    plt.ylabel("Count")
    plt.show()
# Top-five frequency plots for key columns, each with a different palette slice
TopFivePlot(movies, 'Actor 1', colors)
TopFivePlot(movies, 'Genre')
TopFivePlot(movies, 'Director', colors[7:])
TopFivePlot(movies, 'Rating', colors[4:])
# Plotly's default qualitative palette
colors = px.colors.qualitative.Plotly
# Histogram of release years, normalised to a probability density
fig_year = px.histogram(movies, x='Year', histnorm='probability density', nbins=30, color_discrete_sequence=colors)
# Cosmetic layout tweaks: centred title, no grid lines, white background
fig_year.update_layout(
    title=dict(text='Distribution of Year', x=0.5, pad=dict(t=20), font=dict(size=20)),
    xaxis_title='Year',
    yaxis_title='Probability Density',
    xaxis=dict(showgrid=False),
    yaxis=dict(showgrid=False),
    bargap=0.02,
    plot_bgcolor='white',
)
fig_year.show()
# Histogram of movie durations (probability density), third palette color
fig_duration = px.histogram(movies, x='Duration', histnorm='probability density', nbins=40, color_discrete_sequence=colors[2:3])
fig_duration.update_traces(selector=dict(type='histogram'))
fig_duration.update_layout(
    title='Distribution of Duration',
    title_x=0.5,
    title_pad=dict(t=20),
    title_font=dict(size=20),
    xaxis_title='Duration',
    yaxis_title='Probability Density',
    xaxis=dict(showgrid=False),
    yaxis=dict(showgrid=False),
    bargap=0.02,
    plot_bgcolor='white',
)
fig_duration.show()
# Histogram of ratings (probability density), fifth palette color
fig_rating = px.histogram(movies, x='Rating', histnorm='probability density', nbins=70, color_discrete_sequence=colors[4:5])
fig_rating.update_traces(type='histogram')
# Centred title, no grid, white background
fig_rating.update_layout(title='Distribution of Rating', title_x=0.5, title_font=dict(size=20),
                         xaxis_title='Rating', yaxis_title='Probability Density',
                         xaxis=dict(showgrid=False), yaxis=dict(showgrid=False),
                         bargap=0.02, plot_bgcolor='white')
fig_rating.show()
# Violin plot for the (heavily skewed) distribution of vote counts
fig_votes = px.violin(movies, y='Votes', color_discrete_sequence=colors[3:4])
# Centred title, no grid, white background
fig_votes.update_layout(title='Distribution of Votes', title_x=0.5,
                        xaxis_title='Probability Density', yaxis_title='Votes',
                        xaxis=dict(showgrid=False), yaxis=dict(showgrid=False),
                        plot_bgcolor='white')
fig_votes.show()
The distribution of Year is skewed to the left, with a high concentration of movies released between 2015-2019
The duration of movies follows a Gaussian distribution with very few outliers
The distribution of Rating is also approximately Gaussian, with a high concentration around 6.6 and 6.7
The number of votes has plenty of outliers
# Mean rating per release year
year_avg_rating = movies.groupby('Year')['Rating'].mean()
# The ten best years by average rating
top_10_years = year_avg_rating.nlargest(10)
# Bar chart of the top 10 years, bars coloured by rating on the darkmint scale
fig_top_years = px.bar(top_10_years.reset_index(),  # reset_index turns the Series into a DataFrame
                       x='Year', y='Rating',
                       title='Top 10 Years by Average Rating',
                       color='Rating',
                       color_continuous_scale='darkmint')
fig_top_years.update_layout(xaxis_title='Year', yaxis_title='Average Rating', plot_bgcolor='white')
fig_top_years.show()
# Mean rating per year as a trend line
average_rating_by_year = movies.groupby('Year')['Rating'].mean().reset_index()
fig_rating_trend = px.line(average_rating_by_year, x='Year', y='Rating',
                           title='Trends in ratings across years',
                           labels={'Rating': 'Average Rating'},
                           template='plotly_white', color_discrete_sequence=colors[9:10])
fig_rating_trend.show()
colors = px.colors.qualitative.Plotly # This is a built-in color sequence with enough elements
# Verify the length of the colors list
if len(colors) < 23:
print(f"Colors list has only {len(colors)} elements, using default color sequence.")
color_sequence = colors
else:
color_sequence = colors[21:23]
# Group the data and calculate the average votes by year
average_votes_by_year = movies.groupby('Year')['Votes'].mean().reset_index()
# Create the line plot with Plotly Express
fig = px.line(average_votes_by_year, x='Year', y='Votes', title='Trends in votes across years',
labels={'Votes': 'Average Votes'}, template='plotly_white', color_discrete_sequence=color_sequence)
fig.show()
Colors list has only 10 elements, using default color sequence.
# Group data by Year and Genre and calculate the average rating
average_rating_by_year_genre = movies.groupby(['Year', 'Genre'])['Rating'].mean().reset_index()
# Get the top 3 genres by movie count
top_3_genres = movies['Genre'].value_counts().head(3).index
# Filter the data to include only the top 3 genres
average_rating_top_3_genres = average_rating_by_year_genre[average_rating_by_year_genre['Genre'].isin(top_3_genres)]
# Create the line plot with Plotly Express
fig = px.line(average_rating_top_3_genres, x='Year', y='Rating', color='Genre',
              title='Average Rating by Year for Top 3 Genres',
              labels={'Year': 'Year', 'Rating': 'Average Rating'},
              color_discrete_sequence=colors[7:11],
              template='plotly_white')
# Show the plot
fig.show()
# Top Directors Analysis
director_df = movies[['Director', 'Year']].dropna()
director_df['Movie_Count'] = 1  # unit count; summed per director below
# Get the top 21 directors by movie count (the original comments said 20)
top_21_directors = director_df['Director'].value_counts().head(21).index.tolist()
top_21_director_df = director_df[director_df['Director'].isin(top_21_directors)]
# Calculate count of movies for each director
top_21_director_count = top_21_director_df.groupby('Director')['Movie_Count'].sum().reset_index()
# Plot the top 21 directors by movie count over the years
fig = px.bar(top_21_director_count, x='Director', y='Movie_Count', color='Director',
             title='Top 21 Directors by Number of Movies Made Over the Years')
# Update layout
fig.update_layout(
    xaxis=dict(title='Director', tickfont_size=14, showgrid=False),  # Remove grid lines
    yaxis=dict(title='Number of Movies', tickfont_size=14, showgrid=False),  # Remove grid lines
    legend_title='Director',
    height=800,  # Increase plot height
    plot_bgcolor='white',  # Change background color
)
fig.show()
# Plot the number of movies released by year
year_count = movies['Year'].value_counts().reset_index()
year_count.columns = ['Year', 'Count']
fig = px.bar(year_count, x='Year', y='Count', text='Count', title='Number of Movies Released by Year', template='plotly_white', color_discrete_sequence=colors[5:9])
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')  # abbreviated count labels above bars
fig.update_layout(
    xaxis=dict(title='Year of Movie Release', titlefont_size=17, tickfont_size=17, showgrid=False),
    yaxis=dict(title='Count of Movies Released', titlefont_size=17, tickfont_size=17, showgrid=False)
)
fig.show()
# Group data by Duration and calculate the average rating
average_rating_by_duration = movies.groupby('Duration')['Rating'].mean().reset_index()
fig = px.line(average_rating_by_duration, x='Duration', y='Rating',
              title='Impact of Movie Duration on Rating',
              labels={'Duration': 'Duration (minutes)', 'Rating': 'Average Rating'},
              template='plotly_white', color_discrete_sequence=colors[1:3])
# Customize the layout
fig.update_layout(title_x=0.5,
                  xaxis=dict(title='Duration of Movie in Minutes', showgrid=False),
                  yaxis=dict(title='Average Rating', showgrid=False))
# Show the plot
fig.show()
# Duration vs Rating scatter with an OLS trendline
fig_dur_rating = px.scatter(movies, x='Duration', y='Rating', trendline='ols', color='Rating', color_continuous_scale='RdBu')
# Update layout
fig_dur_rating.update_layout(
    title='Impact of Movie Duration on Rating',
    title_x=0.5,
    title_pad=dict(t=20),
    title_font=dict(size=20),
    xaxis_title='Duration of Movie (Minutes)',
    yaxis_title='Rating',
    xaxis=dict(showgrid=False),
    yaxis=dict(showgrid=False),
    plot_bgcolor='white',
    legend=dict(
        title='Rating',
        bgcolor='rgba(255, 255, 255, 0)',   # transparent legend background
        bordercolor='rgba(255, 255, 255, 0)',
        x=1.02, y=0.5,
        xanchor='left',
        font=dict(size=12)
    )
)
# Show the plot
fig_dur_rating.show()
# Group data by Duration and calculate the average votes
average_votes_by_duration = movies.groupby('Duration')['Votes'].mean().reset_index()
# Create the line plot with Plotly Express
fig = px.line(average_votes_by_duration, x='Duration', y='Votes',
              title='Impact of Movie Duration on Votes',
              labels={'Duration': 'Duration (minutes)', 'Votes': 'Average Votes'},
              template='plotly_white', color_discrete_sequence=colors[5:6])
# Customize the layout
fig.update_layout(title_x=0.5, title_pad=dict(t=20), title_font=dict(size=20),
                  xaxis=dict(title='Duration of Movie in Minutes', showgrid=False),
                  yaxis=dict(title='Average Votes', showgrid=False))
# Show the plot
fig.show()
# Duration vs Votes scatter with an OLS trendline
fig_dur_votes = px.scatter(movies, x='Duration', y='Votes', trendline='ols', color="Votes", color_continuous_scale="darkmint")
fig_dur_votes.update_layout(title='Does length of movie have any impact on Votes?', title_x=0.5, title_pad=dict(t=20), title_font=dict(size=20), xaxis_title='Duration of Movie in Minutes', yaxis_title='Votes of a movie', xaxis=dict(showgrid=False), yaxis=dict(showgrid=False), plot_bgcolor='white')
fig_dur_votes.show()
# Group data by Genre and calculate the average rating
average_rating_by_genre = movies.groupby('Genre')['Rating'].mean().reset_index()
# Create the bar plot with Plotly Express
fig = px.bar(average_rating_by_genre, x='Genre', y='Rating',
             title='Impact of Movie Genre on Rating',
             labels={'Genre': 'Genre of Movie', 'Rating': 'Average Rating'},
             template='plotly_white', color_discrete_sequence=colors[6:])
# Customize the layout
fig.update_layout(title_x=0.5,
                  xaxis=dict(title='Genre of Movie', showgrid=False),
                  yaxis=dict(title='Average Rating', showgrid=False))
# Show the plot
fig.show()
import plotly.express as px  # NOTE(review): redundant — px is already imported at the top of the notebook
# Group the data by 'Year' and 'Genre' and count the occurrences of each genre in each year
genre_count_by_year = movies.groupby(['Year', 'Genre']).size().reset_index(name='Count')
# Create line plot
fig_genre_trends = px.line(genre_count_by_year, x='Year', y='Count', color='Genre', title='Genre Trends Across Years',
                           line_shape='spline', render_mode='svg', color_discrete_sequence=['black', 'mintcream', 'red', 'yellow', 'green', 'purple'])
# Update layout
fig_genre_trends.update_layout(
    xaxis_title='Year',
    yaxis_title='Count',
    plot_bgcolor='rgba(0,0,0,0)',    # transparent plot area
    paper_bgcolor='rgba(0,0,0,0)',   # transparent canvas
    legend_title='Genre',
    legend=dict(
        x=1.02,
        y=0.5,
        xanchor='left',
        font=dict(size=10)
    ),
    font=dict(
        family="Arial, sans-serif",
        size=12,
        color="black"
    ),
    title_font=dict(
        size=20,
        family="Arial, sans-serif",
        color="black"
    ),
    title_x=0.5,
    title_y=0.92,
    title_font_size=24  # NOTE(review): overrides the size=20 set in title_font above
)
# Show the plot
fig_genre_trends.show()
# Duration, Rating, and Votes Analysis
dur_rat = movies[['Duration', 'Rating', 'Votes']].dropna()
# Inspecting the data
print(dur_rat['Duration'].unique())
# Clean and convert 'Duration' column: strip the ' min' suffix, then drop any
# remaining non-numeric characters before casting to int.
dur_rat['Duration'] = dur_rat['Duration'].astype(str).str.replace(' min', '')
dur_rat['Duration'] = dur_rat['Duration'].str.replace(r'[^\d.]', '', regex=True).astype(float).astype(int)
# Clean and convert 'Votes' column: remove thousands separators
dur_rat['Votes'] = dur_rat['Votes'].astype(str).str.replace(',', '').astype(float)
# 3D Scatter Plot
fig = px.scatter_3d(dur_rat, x='Duration', y='Rating', z='Votes', color='Rating', title='3D Plot of Duration, Rating, and Votes', color_continuous_scale='RdBu')
fig.show()
[109. 110. 147. 142. 82. 116. 96. 120. 161. 166. 102. 87. 132. 105. 146. 168. 158. 94. 124. 157. 107. 113. 80. 122. 149. 130. 121. 126. 188. 115. 103. 114. 170. 100. 99. 140. 128. 93. 125. 145. 75. 111. 134. 104. 92. 136. 137. 127. 119. 90. 150. 151. 95. 112. 143. 177. 117. 148. 123. 144. 72. 154. 175. 153. 78. 138. 139. 133. 180. 135. 164. 162. 171. 160. 152. 163. 165. 141. 129. 156. 200. 172. 88. 155. 167. 106. 193. 108. 195. 174. 81. 178. 184. 97. 176. 169. 131. 77. 91. 86. 84. 173. 118. 181. 101. 79. 183. 159. 83. 89. 186. 74. 85. 98. 76. 185. 187. 73. 191. 199. 179. 190. 189. 197. 182. 192.]
The peak years for the highest average ratings span 1984 to 1993. Average votes have been consistently increasing since 2013, and movies released around 2010 received the highest average votes. The Drama genre has maintained the highest average ratings since its inception in 1953, while the Action genre started in 1964. Short-duration movies tend to receive higher ratings and more votes, suggesting a preference for concise storytelling.
# Dropping non-essential columns
# Drop the movie title: a free-text identifier, not a predictive feature.
movies.drop('Name', axis = 1, inplace = True)
movies.head()
| Year | Duration | Genre | Rating | Votes | Director | Actor 1 | Actor 2 | Actor 3 | |
|---|---|---|---|---|---|---|---|---|---|
| 1 | 2019 | 109.0 | Drama | 7.0 | 8.0 | Gaurav Bakshi | Rasika Dugal | Vivek Ghamande | Arvind Jangid |
| 3 | 2019 | 110.0 | Comedy | 4.4 | 35.0 | Ovais Khan | Prateik | Ishita Raj | Siddhant Kapoor |
| 3 | 2019 | 110.0 | Romance | 4.4 | 35.0 | Ovais Khan | Prateik | Ishita Raj | Siddhant Kapoor |
| 5 | 1997 | 147.0 | Comedy | 4.7 | 827.0 | Rahul Rawail | Bobby Deol | Aishwarya Rai Bachchan | Shammi Kapoor |
| 5 | 1997 | 147.0 | Drama | 4.7 | 827.0 | Rahul Rawail | Bobby Deol | Aishwarya Rai Bachchan | Shammi Kapoor |
def add_mean_ratings(movies):
    """Append mean-rating (target) encodings for the categorical columns.

    Adds, in order: 'Genre_mean_rating', 'Director_encoded', and one
    '<Actor N>_encoded' column per actor slot, each holding the average
    'Rating' over rows sharing that category value. Mutates and returns
    the given DataFrame.
    """
    def _mean_rating_by(column):
        # Average rating per distinct value of `column`.
        return movies.groupby(column)['Rating'].mean()

    movies['Genre_mean_rating'] = movies['Genre'].map(_mean_rating_by('Genre'))
    movies['Director_encoded'] = movies['Director'].map(_mean_rating_by('Director'))
    for slot in ('Actor 1', 'Actor 2', 'Actor 3'):
        movies[f"{slot}_encoded"] = movies[slot].map(_mean_rating_by(slot))
    return movies
# Encode on a deep copy so the raw `movies` frame keeps its original columns.
df = movies.copy(deep=True)
movies_encoded = add_mean_ratings(df)
movies_encoded.head()
| Year | Duration | Genre | Rating | Votes | Director | Actor 1 | Actor 2 | Actor 3 | Genre_mean_rating | Director_encoded | Actor 1_encoded | Actor 2_encoded | Actor 3_encoded | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 2019 | 109.0 | Drama | 7.0 | 8.0 | Gaurav Bakshi | Rasika Dugal | Vivek Ghamande | Arvind Jangid | 6.056998 | 7.000000 | 6.850000 | 7.00 | 7.00 |
| 3 | 2019 | 110.0 | Comedy | 4.4 | 35.0 | Ovais Khan | Prateik | Ishita Raj | Siddhant Kapoor | 5.752085 | 4.400000 | 5.250000 | 4.40 | 4.46 |
| 3 | 2019 | 110.0 | Romance | 4.4 | 35.0 | Ovais Khan | Prateik | Ishita Raj | Siddhant Kapoor | 5.812772 | 4.400000 | 5.250000 | 4.40 | 4.46 |
| 5 | 1997 | 147.0 | Comedy | 4.7 | 827.0 | Rahul Rawail | Bobby Deol | Aishwarya Rai Bachchan | Shammi Kapoor | 5.752085 | 5.335135 | 4.793617 | 5.73 | 5.93 |
| 5 | 1997 | 147.0 | Drama | 4.7 | 827.0 | Rahul Rawail | Bobby Deol | Aishwarya Rai Bachchan | Shammi Kapoor | 6.056998 | 5.335135 | 4.793617 | 5.73 | 5.93 |
This streamlined feature engineering process ensures a concise yet effective dataset for model training.
# Feature matrix: drop the target and the raw categorical columns
# (their mean-encoded counterparts remain).
features = movies_encoded.drop(columns=['Rating', 'Director', 'Actor 1', 'Actor 2', 'Actor 3', 'Genre']).values
target = movies_encoded['Rating'].values
# 80/20 train/test split with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
print("Training set features shape:", x_train.shape)
print("Testing set features shape:", x_test.shape)
print("Training set target shape:", y_train.shape)
print("Testing set target shape:", y_test.shape)
Training set features shape: (9570, 8) Testing set features shape: (2393, 8) Training set target shape: (9570,) Testing set target shape: (2393,)
# Persist each train/test split to CSV, replacing any stale file first.
# Fixed: the y_train/y_test messages and comments wrongly said "features".
# Save the training features
x_train_path = r'C:\Users\acer\Downloads\x_train.csv'
if os.path.exists(x_train_path):
    os.remove(x_train_path)
x_train_df = pd.DataFrame(data=x_train)
x_train_df.to_csv(x_train_path, index=False)
print("Training features saved successfully!")
# Save the testing features
x_test_path = r'C:\Users\acer\Downloads\x_test.csv'
if os.path.exists(x_test_path):
    os.remove(x_test_path)
x_test_df = pd.DataFrame(data=x_test)
x_test_df.to_csv(x_test_path, index=False)
print("Testing features saved successfully!")
# Save the training target
y_train_path = r'C:\Users\acer\Downloads\y_train.csv'
if os.path.exists(y_train_path):
    os.remove(y_train_path)
y_train_df = pd.DataFrame(data=y_train)
y_train_df.to_csv(y_train_path, index=False)
print("Training target saved successfully!")
# Save the testing target
y_test_path = r'C:\Users\acer\Downloads\y_test.csv'
if os.path.exists(y_test_path):
    os.remove(y_test_path)
y_test_df = pd.DataFrame(data=y_test)
y_test_df.to_csv(y_test_path, index=False)
print("Testing target saved successfully!")
# Univariate feature selection: keep the 7 best features by F-statistic.
Kbest_reg = SelectKBest(score_func=f_regression, k=7)
Kbest_reg.fit(x_train, y_train)
# what are scores for the features
for i in range(len(Kbest_reg.scores_)):
    print(f'Feature {i} : {round(Kbest_reg.scores_[i],3)}')
print()
# plot the scores
plt.bar([i for i in range(len(Kbest_reg.scores_))], Kbest_reg.scores_)
plt.show()
Feature 0 : 545.877 Feature 1 : 16.196 Feature 2 : 229.87 Feature 3 : 517.18 Feature 4 : 17218.75 Feature 5 : 10987.842 Feature 6 : 12371.432 Feature 7 : 13242.967
# Reduce the train matrix to the 7 selected features.
x_train_reg = Kbest_reg.transform(x_train)
print("X_train.shape: {}".format(x_train.shape))
print()
print("X_train_selected.shape: {}".format(x_train_reg.shape))
print()
# transform test data
x_test_reg = Kbest_reg.transform(x_test)
X_train.shape: (9570, 8) X_train_selected.shape: (9570, 7)
def model_regressor(regressor, x_train, y_train, x_test, y_test):
    """Fit `regressor` on the training split, print error metrics, and plot predictions.

    Prints MSE, RMSE, MAE and the R^2 score on both train and test data,
    then delegates to model_evaluation_visualize() for an actual-vs-predicted
    scatter plot. Returns None; `regressor` is fitted in place.
    """
    regressor.fit(x_train, y_train)
    y_pred = regressor.predict(x_test)
    # Calculate metrics.  (Removed an unused RepeatedKFold splitter the
    # original constructed here but never passed anywhere.)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2_train = regressor.score(x_train, y_train)
    r2_test = r2_score(y_test, y_pred)
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"R-squared Score(R2 ) on Train Data: {r2_train:.4f}")
    print(f"R-squared Score(R2 ) on Test Data: {r2_test:.4f}")
    # Evaluation
    model_evaluation_visualize(regressor, x_test, y_test)
def model_evaluation_visualize(regressor, x_test, y_test):
    """Scatter actual vs. predicted test values with a y = x reference line."""
    predictions = regressor.predict(x_test)
    lo, hi = y_test.min(), y_test.max()
    plt.figure(figsize=(8, 6))
    # Points on the dashed diagonal are perfect predictions.
    plt.scatter(y_test, predictions, color='blue', edgecolors=(0, 0, 0))
    plt.plot([lo, hi], [lo, hi], 'k--', lw=3)
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.title('Actual vs Predicted Values')
    plt.show()
def cross_validation(regressor, x_train, y_train):
    """Return repeated 10-fold CV scores (negative MAE) for `regressor`."""
    splitter = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(regressor, x_train, y_train,
                           scoring='neg_mean_absolute_error', cv=splitter, n_jobs=-1)
def kfold_cross_validation(classifier, x_train, y_train, cv, scoring= 'neg_mean_absolute_error'):
    """Manual k-fold cross-validation; returns one score per fold.

    Parameters: `cv` is the number of folds; `scoring` is either a callable
    (y_true, y_pred) -> float or the string 'neg_mean_absolute_error'.

    Bug fixes vs. the original: KFold/StratifiedKFold were referenced but
    never imported (only the Repeated* variants are imported at file top),
    and the default `scoring` was a string yet invoked as a function, so the
    function crashed whenever called with defaults.
    """
    from sklearn.model_selection import KFold, StratifiedKFold
    from sklearn.metrics import mean_absolute_error

    # Accept sklearn-style string names as well as plain callables.
    if isinstance(scoring, str):
        if scoring == 'neg_mean_absolute_error':
            scoring = lambda y_true, y_pred: -mean_absolute_error(y_true, y_pred)
        else:
            raise ValueError(f"Unsupported scoring name: {scoring}")
    # Use stratified k-fold for classification problems if the classifier supports probability prediction
    if hasattr(classifier, 'predict_proba'):
        kfold = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
    else:
        kfold = KFold(n_splits=cv, shuffle=True, random_state=42)
    scores = []
    for train_index, test_index in kfold.split(x_train, y_train):
        x_train_fold, x_test_fold = x_train[train_index], x_train[test_index]
        y_train_fold, y_test_fold = y_train[train_index], y_train[test_index]
        classifier.fit(x_train_fold, y_train_fold)
        y_pred = classifier.predict(x_test_fold)
        scores.append(scoring(y_test_fold, y_pred))  # per-fold score
    return scores
# Instantiate the candidate regressors, seeding each for reproducibility.
lr = LinearRegression()
dtr = DecisionTreeRegressor(random_state=42)
ridge = Ridge(random_state=42)
gbr= GradientBoostingRegressor(random_state=42)
rf = RandomForestRegressor(random_state=42)
# random_state added for consistency with the `regressors` comparison list below.
xgb = XGBRegressor(random_state=42)
model_regressor(rf, x_train, y_train, x_test, y_test)
Mean Squared Error (MSE): 0.1185 Root Mean Squared Error (RMSE): 0.3442 Mean Absolute Error (MAE): 0.1858 R-squared Score(R2 ) on Train Data: 0.9921 R-squared Score(R2 ) on Test Data: 0.9376
model_regressor(lr, x_train, y_train, x_test, y_test)
Mean Squared Error (MSE): 0.4566 Root Mean Squared Error (RMSE): 0.6758 Mean Absolute Error (MAE): 0.4991 R-squared Score(R2 ) on Train Data: 0.7721 R-squared Score(R2 ) on Test Data: 0.7595
model_regressor(ridge, x_train, y_train, x_test, y_test)
Mean Squared Error (MSE): 0.4566 Root Mean Squared Error (RMSE): 0.6758 Mean Absolute Error (MAE): 0.4991 R-squared Score(R2 ) on Train Data: 0.7721 R-squared Score(R2 ) on Test Data: 0.7595
model_regressor(gbr, x_train, y_train, x_test, y_test)
Mean Squared Error (MSE): 0.3392 Root Mean Squared Error (RMSE): 0.5824 Mean Absolute Error (MAE): 0.4197 R-squared Score(R2 ) on Train Data: 0.8549 R-squared Score(R2 ) on Test Data: 0.8213
model_regressor(dtr, x_train, y_train, x_test, y_test)
Mean Squared Error (MSE): 0.1389 Root Mean Squared Error (RMSE): 0.3727 Mean Absolute Error (MAE): 0.1230 R-squared Score(R2 ) on Train Data: 1.0000 R-squared Score(R2 ) on Test Data: 0.9262
model_regressor(xgb, x_train, y_train, x_test, y_test)
Mean Squared Error (MSE): 0.1654 Root Mean Squared Error (RMSE): 0.4067 Mean Absolute Error (MAE): 0.2579 R-squared Score(R2 ) on Train Data: 0.9747 R-squared Score(R2 ) on Test Data: 0.9129
model_regressor(xgb, x_train, y_train, x_test, y_test)
Mean Squared Error (MSE): 0.1654 Root Mean Squared Error (RMSE): 0.4067 Mean Absolute Error (MAE): 0.2579 R-squared Score(R2 ) on Train Data: 0.9747 R-squared Score(R2 ) on Test Data: 0.9129
def adjusted_r2_score(r2, n, k):
    """Adjusted R^2 for a model with `k` predictors fitted on `n` samples.

    Penalises plain R^2 for model complexity:
    adj = 1 - (1 - R^2) * (n - 1) / (n - k - 1).
    """
    complexity_penalty = (n - 1) / (n - k - 1)
    return 1 - (1 - r2) * complexity_penalty
def compare_models_metrics(regressors, x_train, y_train, x_test, y_test):
    """Fit each (name, regressor) pair, print its metrics, and return them as a DataFrame.

    Reported per model: train/test R2, adjusted R2, MSE, MAE, plus the mean
    and std of repeated 10-fold cross-validated R2 on the training split.
    Returns a DataFrame with one row per model and a 'Model' name column.
    """
    metrics = ['Train R2', 'Test R2', 'Train Adj R2', 'Test Adj R2', 'Train MSE', 'Test MSE', 'Train MAE', 'Test MAE', 'Cross Validation Mean Score', 'Cross Validation Std Score']
    metrics_values = []
    names = []
    for name, regressor in regressors:
        regressor.fit(x_train, y_train)
        prediction = regressor.predict(x_test)
        # Predict the training split once (the original recomputed it three
        # times: for R2, MSE and MAE separately).
        train_prediction = regressor.predict(x_train)
        # Calculate metrics
        train_r2 = r2_score(y_train, train_prediction)
        test_r2 = r2_score(y_test, prediction)
        n_train = len(y_train)
        n_test = len(y_test)
        k = x_train.shape[1]  # number of predictors, for adjusted R2
        train_adj_r2 = adjusted_r2_score(train_r2, n_train, k)
        test_adj_r2 = adjusted_r2_score(test_r2, n_test, k)
        train_mse = mean_squared_error(y_train, train_prediction)
        test_mse = mean_squared_error(y_test, prediction)
        train_mae = mean_absolute_error(y_train, train_prediction)
        test_mae = mean_absolute_error(y_test, prediction)
        cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
        cv_scores = cross_val_score(regressor, x_train, y_train, cv=cv, scoring='r2')
        cross_val_score_mean = cv_scores.mean()
        cross_val_score_std = cv_scores.std()
        metrics_values.append([train_r2, test_r2, train_adj_r2, test_adj_r2, train_mse, test_mse, train_mae, test_mae, cross_val_score_mean, cross_val_score_std])
        names.append(name)
        # Print metrics
        print('\n')
        print(f"Model: {name}")
        print(f"Train R2: {train_r2:.2f}")
        print(f"Test R2: {test_r2:.2f}")
        print(f"Train Adj R2: {train_adj_r2:.2f}")
        print(f"Test Adj R2: {test_adj_r2:.2f}")
        print(f"Train MSE: {train_mse:.2f}")
        print(f"Test MSE: {test_mse:.2f}")
        print(f"Train MAE: {train_mae:.2f}")
        print(f"Test MAE: {test_mae:.2f}")
        print(f"Cross Validation Mean Score: {cross_val_score_mean:.2f}")
        print(f"Cross Validation Std Score: {cross_val_score_std:.2f}")
        print("-" * 50, end="\n")
    df_metrics = pd.DataFrame(metrics_values, columns=metrics)
    df_metrics['Model'] = names
    return df_metrics
def compare_models_metrics_heatmap(df_metrics):
    """Render the model-comparison table as an annotated heatmap (one row per model)."""
    metric_columns = [
        'Train R2', 'Test R2', 'Train Adj R2', 'Test Adj R2',
        'Train MSE', 'Test MSE', 'Train MAE', 'Test MAE',
        'Cross Validation Mean Score', 'Cross Validation Std Score',
    ]
    # Index by model name so each heatmap row is labelled with the model.
    table = df_metrics.set_index('Model')[metric_columns]
    plt.figure(figsize=(13, 11))
    sns.heatmap(table, annot=True, cmap='YlGnBu', fmt=".2f")
    plt.title('Comparison of Regressor Metrics')
    plt.xlabel('Metric')
    plt.ylabel('Regressor')
    plt.tight_layout()
    plt.show()
# Define regression models as (display name, estimator) pairs.
regressors = [
    ('Linear Regression', LinearRegression()),
    ('Decision Tree Regressor', DecisionTreeRegressor(random_state=42)),
    ('Ridge', Ridge(random_state=42)),
    ('Gradient Boosting Regressor', GradientBoostingRegressor(random_state=42)),
    ('Random Forest Regressor', RandomForestRegressor(random_state=42)),
    ('XGB Regressor', XGBRegressor(random_state=42))
]
# Example usage:
# Assuming x_train, y_train, x_test, y_test are defined and preprocessed
df_metrics = compare_models_metrics(regressors, x_train, y_train, x_test, y_test)
Model: Linear Regression Train R2: 0.78 Test R2: 0.77 Train Adj R2: 0.78 Test Adj R2: 0.77 Train MSE: 0.40 Test MSE: 0.43 Train MAE: 0.48 Test MAE: 0.48 Cross Validation Mean Score: 0.78 Cross Validation Std Score: 0.02 -------------------------------------------------- Model: Decision Tree Regressor Train R2: 1.00 Test R2: 0.93 Train Adj R2: 1.00 Test Adj R2: 0.93 Train MSE: 0.00 Test MSE: 0.14 Train MAE: 0.00 Test MAE: 0.12 Cross Validation Mean Score: 0.90 Cross Validation Std Score: 0.02 -------------------------------------------------- Model: Ridge Train R2: 0.78 Test R2: 0.77 Train Adj R2: 0.78 Test Adj R2: 0.77 Train MSE: 0.40 Test MSE: 0.43 Train MAE: 0.48 Test MAE: 0.48 Cross Validation Mean Score: 0.78 Cross Validation Std Score: 0.02 -------------------------------------------------- Model: Gradient Boosting Regressor Train R2: 0.85 Test R2: 0.81 Train Adj R2: 0.85 Test Adj R2: 0.81 Train MSE: 0.29 Test MSE: 0.35 Train MAE: 0.39 Test MAE: 0.43 Cross Validation Mean Score: 0.83 Cross Validation Std Score: 0.02 --------------------------------------------------
compare_models_metrics_heatmap(df_metrics)
# Define models: (short name, estimator) pairs for the residual-plot loop below.
models = [
    ('LR', LinearRegression()),
    ('DTR', DecisionTreeRegressor(random_state=42)),
    ('Ridge', Ridge(random_state=42)),
    ('GBR', GradientBoostingRegressor(random_state=42)),
    ('RF', RandomForestRegressor(random_state=42)),
    ('XGB', XGBRegressor(random_state=42))
]
# Function to plot residuals
def plot_residuals(model, model_name):
    """Fit a yellowbrick ResidualsPlot on the global train/test split and display it."""
    viz = ResidualsPlot(model, hist=True, figsize=(15, 6), colors=["black", "darkgoldenrod"])
    viz.fit(x_train, y_train)    # residuals on the training data
    viz.score(x_test, y_test)    # residuals on the test data
    # Cosmetic tweaks on the underlying axes.
    viz.ax.set_title(f"Residuals Plot for {model_name}")
    viz.ax.set_xlabel("Predicted Value")
    viz.ax.set_ylabel("Residuals")
    viz.ax.grid(True, which='both', linestyle='--', linewidth=0.5)
    viz.show()

# Plot residuals for all models
for name, model in models:
    print(f"Plotting residuals for {name}...")
    plot_residuals(model, name)
Plotting residuals for LR...
Plotting residuals for DTR...
Plotting residuals for Ridge...
Plotting residuals for GBR...
Plotting residuals for RF...
Plotting residuals for XGB...
# Suppress warnings
import warnings
warnings.filterwarnings("ignore", message="X has feature names")
# Define your data
# NOTE(review): training features were ordered Year, Duration, Votes, ... but
# this dict is ordered Year, Votes, Duration — and Votes=141 / Duration=3581
# look swapped for their labels. Verify key order and values against the
# training feature order before trusting these predictions.
data = {
    'Year': [2019],
    'Votes': [141],
    'Duration': [3581],
    'Genre_mean_rating': [5.6],
    'Director_encoded': [6.83],
    'Actor1_encoded': [6.2],
    'Actor2_encoded': [5.1],
    'Actor3_encoded': [6.4]
}
# Create DataFrame
df = pd.DataFrame(data)
# Define models
# NOTE(review): these estimators were fitted earlier via model_regressor();
# .predict() would raise on unfitted models.
models = {
    'Linear Regression': lr,
    'Decision Tree Regressor': dtr,
    'Ridge': ridge,
    'Gradient Boosting Regressor': gbr,
    'Random Forest Regressor': rf,
    'XGBoost Regressor': xgb
}
# Make predictions using each model
for name, model in models.items():
    print(f"Predictions using {name}:")
    predicted_rating = model.predict(df)
    print("Predicted Rating:", predicted_rating[0])
    print()
# Define a dictionary to store predicted ratings for each model
predicted_ratings = {}
# Predict ratings for each model
for name, model in models.items():
    predicted_rating = model.predict(df)
    predicted_ratings[name] = predicted_rating[0]
# Find the model with the closest predicted rating to the original rating (6.6)
original_rating = 6.6
best_model = min(predicted_ratings, key=lambda x: abs(predicted_ratings[x] - original_rating))
print(f"The best model is {best_model} with predicted rating of {predicted_ratings[best_model]}")
Predictions using Linear Regression: Predicted Rating: 6.3596194204496665 Predictions using Decision Tree Regressor: Predicted Rating: 6.6 Predictions using Ridge: Predicted Rating: 6.359568775313514 Predictions using Gradient Boosting Regressor: Predicted Rating: 6.1947144654586825 Predictions using Random Forest Regressor: Predicted Rating: 6.360000000000002 Predictions using XGBoost Regressor: Predicted Rating: 5.972795 The best model is Decision Tree Regressor with predicted rating of 6.6
def predict_movie_rating(model, input_data):
    """Predict a rating for one movie described by a feature-name -> value dict.

    The dict's insertion order must match the feature order the model was
    trained on; the values are flattened into a single-row 2-D array.
    Prints each transformation step and returns the scalar prediction.
    """
    print("Input data:", input_data)
    values = list(input_data.values())
    print("\nInput data as list:", values)
    # Single sample -> shape (1, n_features).
    features = np.array(values).reshape(1, -1)
    print("\nInput data reshaped:", features)
    prediction = model.predict(features)
    print("\n\n_________________________________\nPrediction:", prediction)
    return prediction[0]
# Function to take input from the user
def get_user_input():
    """Interactively collect the eight model features from stdin.

    Returns a dict whose insertion order matches the training feature order
    expected by predict_movie_rating().
    """
    fields = [
        ('Year', "Enter the year: ", int),
        ('Votes', "Enter the number of votes: ", int),
        ('Duration', "Enter the duration (in minutes): ", int),
        ('Genre_mean_rating', "Enter the genre mean rating: ", float),
        ('Director_encoded', "Enter the director encoded value: ", float),
        ('Actor1_encoded', "Enter the actor1 encoded value: ", float),
        ('Actor2_encoded', "Enter the actor2 encoded value: ", float),
        ('Actor3_encoded', "Enter the actor3 encoded value: ", float),
    ]
    # Prompt in a fixed order so the dict preserves the expected feature order.
    return {name: caster(input(prompt)) for name, prompt, caster in fields}
# Load your RandomForestRegressor model before making predictions
# rf = joblib.load('your_model_path.pkl')
# Example usage: collect features from stdin and predict with the decision tree.
input_data = get_user_input()
result = predict_movie_rating(dtr, input_data)
#data = {'Year': [2019], 'Votes': [141], 'Duration': [3581], 'Genre_mean_rating': [5.633333333], 'Director_encoded': [6.833333333], 'Actor1_encoded': [6.225806452], 'Actor2_encoded': [5.057462687], 'Actor3_encoded': [6.44]}
# Print results
print("\n\nIndividual Input Data:", input_data, "\n\nPrediction:", result)
Enter the year: 2010
Enter the number of votes: 141
Enter the duration (in minutes): 3581
Enter the genre mean rating: 5.6
Enter the director encoded value: 6.83
Enter the actor1 encoded value: 6.2
Enter the actor2 encoded value: 5.1
Enter the actor3 encoded value: 6.4
Input data: {'Year': 2010, 'Votes': 141, 'Duration': 3581, 'Genre_mean_rating': 5.6, 'Director_encoded': 6.83, 'Actor1_encoded': 6.2, 'Actor2_encoded': 5.1, 'Actor3_encoded': 6.4}
Input data as list: [2010, 141, 3581, 5.6, 6.83, 6.2, 5.1, 6.4]
Input data reshaped: [[2010. 141. 3581. 5.6 6.83 6.2 5.1 6.4 ]]
_________________________________
Prediction: [6.6]
Individual Input Data: {'Year': 2010, 'Votes': 141, 'Duration': 3581, 'Genre_mean_rating': 5.6, 'Director_encoded': 6.83, 'Actor1_encoded': 6.2, 'Actor2_encoded': 5.1, 'Actor3_encoded': 6.4}
Prediction: 6.6
## Saving Model
import joblib
# Persist the trained decision tree. joblib.dump accepts a filename directly;
# the original passed open(filename, 'wb') and leaked the file handle.
filename = "DTR_Regressor.sav"
joblib.dump(dtr, filename)
# Reload and sanity-check the persisted model on the test split.
loaded_model_DTR = joblib.load("DTR_Regressor.sav")
result = loaded_model_DTR.score(x_test, y_test)
print(result)
0.9262468361216416
# Retrain a fresh random forest and report its R^2 on the test split.
rf = RandomForestRegressor(random_state=42)
rf.fit(x_train, y_train)
rf.score(x_test, y_test)
0.9383800492163734
## Saving Model
import joblib
# Persist the random forest. Pass the filename to joblib.dump; the original
# passed open(filename, 'wb') and never closed the handle (resource leak).
filename = "RFR_Regressor.sav"
joblib.dump(rf, filename)
# Reload and sanity-check the persisted model on the test split.
loaded_model_RFR = joblib.load("RFR_Regressor.sav")
result = loaded_model_RFR.score(x_test, y_test)
print(result)
0.9383800492163734